{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import time\n",
    "import pandas as pd\n",
    "import json\n",
    "import pyreadr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "INPUT_JSON = 'tweets/20210609_combined.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(INPUT_JSON) as f:\n",
    "    data = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def referenced_tweet_ids(tweet):\n",
    "    return [t[\"id\"] for t in tweet.get(\"referenced_tweets\", [])]\n",
    "\n",
    "\n",
    "def referenced_author_ids(tweet):\n",
    "    try:\n",
    "        return [t[\"author_id\"] for t in tweet.get(\"referenced_tweets\", [])]\n",
    "    except:\n",
    "        return []\n",
    "    \n",
    "def referenced_author_username(tweet):\n",
    "    try:\n",
    "        return [t[\"author\"][\"username\"] for t in tweet.get(\"referenced_tweets\", [])]\n",
    "    except:\n",
    "        return []\n",
    "\n",
    "def get_mentions(tweet):\n",
    "    try:\n",
    "        return [m[\"username\"] for m in tweet[\"entities\"][\"mentions\"]]\n",
    "    except KeyError:\n",
    "        return []\n",
    "\n",
    "    \n",
    "def media_urls_from_tweet(tweet):\n",
    "    try:\n",
    "        media = tweet[\"attachments\"][\"media\"]\n",
    "        return [m[\"url\"] for m in media]\n",
    "    except Exception as e:\n",
    "        return []\n",
    "\n",
    "    \n",
    "def referenced_tweet_types(tweet):\n",
    "    return [t[\"type\"] for t in tweet.get(\"referenced_tweets\", {})]\n",
    "\n",
    "\n",
    "def full_referenced_tweet(tweet):\n",
    "    return [t[\"text\"] for t in tweet.get(\"referenced_tweets\", {}) if 'text' in t.keys()]\n",
    "\n",
    "\n",
    "def place_name_tweet(tweet):\n",
    "    try:\n",
    "        place_name = tweet[\"geo\"][\"full_name\"]\n",
    "        return place_name\n",
    "    except Exception as e:\n",
    "        return None\n",
    "    \n",
    "    \n",
    "def place_id_tweet(tweet):\n",
    "    try:\n",
    "        place_id = tweet[\"geo\"][\"place_id\"]\n",
    "        return place_id\n",
    "    except Exception as e:\n",
    "        return None\n",
    "    \n",
    "def extract_df(results):\n",
    "    flattened = [{\n",
    "        \"text\": q[\"text\"], \n",
    "        \"id\": q[\"id\"], \n",
    "        \"tweet_created_at\": q[\"created_at\"],\n",
    "        \"tweet_type\": referenced_tweet_types(q),\n",
    "        \"tweet_retweet_count\": q[\"public_metrics\"][\"retweet_count\"],\n",
    "        \"tweet_reply_count\": q[\"public_metrics\"][\"reply_count\"],\n",
    "        \"tweet_like_count\": q[\"public_metrics\"][\"like_count\"],\n",
    "        \"tweet_quote_count\": q[\"public_metrics\"][\"quote_count\"],\n",
    "        \"referenced_tweet_text\": full_referenced_tweet(q),\n",
    "        \"referenced_tweet_ids\": referenced_tweet_ids(q),\n",
    "        \"referenced_author_ids\": referenced_author_ids(q),\n",
    "        \"referenced_author_username\": referenced_author_username(q),\n",
    "        \"referenced_tweets\": q.get(\"referenced_tweets\", []),\n",
    "        \"place_name\": place_name_tweet(q),\n",
    "        \"place_id\": place_id_tweet(q),\n",
    "        \"author_id\": q[\"author_id\"],\n",
    "        \"author_name\": q[\"author\"][\"name\"],\n",
    "        \"author_username\": q[\"author\"][\"username\"],\n",
    "        \"author_created\": q[\"author\"][\"created_at\"],\n",
    "        \"author_followers\": q[\"author\"][\"public_metrics\"][\"followers_count\"],\n",
    "        \"author_following\": q[\"author\"][\"public_metrics\"][\"following_count\"],\n",
    "        \"author_tweets\": q[\"author\"][\"public_metrics\"][\"tweet_count\"],\n",
    "        \"author_description\": q[\"author\"][\"description\"],\n",
    "        \"author_image_url\": q[\"author\"][\"profile_image_url\"],\n",
    "        \"lang\": q[\"lang\"],\n",
    "        \"conversation_id\": q[\"conversation_id\"],\n",
    "        \"media_url\": media_urls_from_tweet(q),\n",
    "    }\n",
    "        for q in results\n",
    "    ]\n",
    "    return pd.DataFrame(flattened)\n",
    "\n",
    "\n",
    "def merge_results(results):\n",
    "    \n",
    "    dfs = []\n",
    "\n",
    "    for k, v in results.items():\n",
    "        df = extract_df(v)\n",
    "        df[\"query\"] = k\n",
    "        dfs.append(df)\n",
    "\n",
    "    return pd.concat(dfs)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = merge_results(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "today = datetime.today().strftime(\"%Y%m%d\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_pickle(f\"tweets/{today}_scrape.pickle\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
